
/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	
NOTE: the do-files 00_master_wage_imp, 01_split_episodes, 04_merge_basic_BHP, 
05_educ_broad, 06_wages_assessment_ceiling, 07_wages_marginal, 
08_wages_deflation, and 10_wages_imputation build on the do-files offered by 
Heiko Stüber, Wolfgang Dauth, Johann Eppelsheimer (see blow) but were slightly 
adjusted to run the code on our sample of the BeH.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/


capture log close
if(${logfile}==1) log using ${log}\10_wages_imputation.log, replace


forv sample = 1/8 {

if `sample' == 1 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 1997 & jahr <= 1999
}

if `sample' == 2 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2000 & jahr <= 2002
}

if `sample' == 3 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2003 & jahr <= 2005
}

if `sample' == 4 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2006 & jahr <= 2008
}

if `sample' == 5 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2009 & jahr <= 2011
}

if `sample' == 6 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2012 & jahr <= 2014
}

if `sample' == 7 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2015 & jahr <= 2017
}

if `sample' == 8 {
	use "$temp\imp_wage_orig.dta", clear
	keep if jahr >= 2018 & jahr <= 2019
}



/*~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
	
	Imputation of right-censored wages
	
	2-Step imputation procedure, similar to Dustmann et al. (2009) and Card et al. (2013):
		- Step 1: based on Gartner (2005)
		- Step 2: imputation models including leave-one-out means
		
	Generates the variables:
		- cens: 1 if right-censored/imputed wage, 0 otherwise; (4 EUR below assessment ceiling)
		- wage: daily wage, not imputed, top-coded wages replaced by assessment ceiling (-4 EUR), deflated (2015)
		- wage_imp: imputed daily wage, deflated (2015)
	

	Author(s): Wolfgang Dauth, Johann Eppelsheimer, Heiko Stüber
	The authors thank Johannes Ludsteck for his valuable suggestions!
	
	Version: 1.0
	Created: 2022-11-24
	
	References:
	Card, D., J. Heining, and P. Kline (2013). Workplace heterogeneity and the rise of West German wage inequality. The Quarterly Journal of Economics 128 (3), 967-1015
	Dustmann, C., J. Ludsteck, and U. Schönberg (2009). Revisiting the German wage structure. The Quarterly Journal of Economics 124 (2), 843-881.
	Gartner, H. (2005).  The imputation of wages above the contribution limit with the German IAB employment sample. FDZ-Methodenreport 02/2005: http://doku.iab.de/fdz/reporte/2005/MR_2.pdf
	https://twitter.com/marxmatt/status/1104570847648456704?s=20

	Note: 
	This do-file is part of the Stata do-file collection offered as a 
	supplement to Stüber, Dauth, and Eppelsheimer (2023): A Guide to Preparing 
	the Sample of Integrated Labour Market Biographies (SIAB, version 7519 v1) 
	for Scientific Analysis. Journal for Labour Market Research, 57:7.
	https://doi.org/10.1186/s12651-023-00335-w
	
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~*/



********************************************************************************
* Modify assessment limit and flag censored wages
********************************************************************************

* substract 4 EUR from exact assessment limit (to make sure all cencored wages are covered by the imputation)
gen limit_assess4 = limit_assess_defl - (100 * 4/cpi)		// ... use deflated 4 Euros
gen ln_limit_assess4 = log(limit_assess4)
label variable limit_assess4 "Contribution assessment ceiling minus 4 EUR"
label variable ln_limit_assess4 "Log of contribution assessment ceiling minus 4 EUR"
compress

* flag censored wages:
gen byte cens = 0
replace cens = 1 if wage_defl > limit_assess4		// only flag BeH spells (quelle == 1)
label variable cens "1 if right-censored/imputed wage, 0 otherwise; (4 EUR below assessment ceiling)"

********************************************************************************
* Overview of censored wages
********************************************************************************

* overall censoring
if(${inspect}==1) tab cens, m

* censoring by education
if(${inspect}==1) tab cens educ, m
if(${inspect}==1) tab cens if educ == 3

* censorig of high-skilled workers by age groups
if(${inspect}==1) forvalues a = 25(5)60 {
	disp "subsample high-skilled: age < `a':"
	tab cens if educ == 3 & age > 18 & age < `a' 
}

* censoring by fulltime/part-time
if(${inspect}==1) tab cens teilzeit

* censoring by gender
if(${inspect}==1) tab cens sex_id

* censoring for men in fulltime employment
if(${inspect}==1) tab cens if sex_id == 0 & teilzeit == 0


********************************************************************************
* Prepare dependent variable: wage
********************************************************************************

* wage
gen wage = wage_defl
replace wage = limit_assess4 if wage_defl > limit_assess4
label variable wage "Daily wage, not imputed, top-coded wages replaced by assessment ceiling (-4 EUR)"
compress

* log-wage
gen ln_wage = log(wage)
compress

* log-wages with censored wages reported as missing
gen ln_wage_cens = ln_wage if cens == 0
compress


********************************************************************************
* Prepare control variables for imputation
********************************************************************************

gen age_sq = (age/10)^2 			// express age squared in 100 years (to have readable coefficients in the regressions)
compress
gen old  = age > 40					// dummy for "older" people
compress
gen age_old = age * old 		    // different age profiles for young and old workers
compress
gen age_sq_old = age_sq * old		// age_old squared
compress

* control variables for imputation:
global controls sex_id teilzeit old age age_sq age_old age_sq_old

* check for missings in control variables
if(${inspect}==1) foreach var of varlist $controls {
	disp "missings in `var':"
	count if missing(`var') 
}


********************************************************************************
* Prepare groups; by default: year, skill group and East/West (by default we assign Berlin to East Germany)
********************************************************************************

* start and end year 
sum jahr
global minY = r(min)
global maxY = r(max)

* education groups (for the imputation regard missings as low-skilled)
gen educ_tmp = educ
replace educ_tmp = 1 if missing(educ)
label values educ_tmp lblValEducen
compress

sum educ_tmp 
global minEduc = r(min)		// minimum level of education
global maxEduc = r(max)		// maximum level of education


********************************************************************************
* Step 1: imputation with observable characteristics (for details refer to Gartner (2005))
********************************************************************************

* sort and set seed
sort persnr spell
set seed 123

* empty variable for (imputed) log-wages
gen ln_wage_imp = .

* create temporal copy of the dataset on the computer's local drive. Loading data from there is considerably faster than loading data over the network.
tempfile	temp_beforeimp
save		"`temp_beforeimp'", replace

keep if missing(east)
replace ln_wage_imp = ln_wage
compress
save ${data}\temp_0.dta, replace   
   
* separately run Tobit-wage-regressions by all groups
local c=0								// Running variable that will index temporary datasets
forvalues y = $minY / $maxY {		// loop over years

	display "$S_DATE $S_TIME: Step 1, processing year `y' ..."		// show current status of computations

	forvalues ed = $minEduc/$maxEduc {	// loop over education groups
		forvalues ew = 0/1 {			// loop over East/West
			
			use "`temp_beforeimp'", clear
			keep if jahr == `y' & educ_tmp == `ed' & east == `ew' // Run tobit regression on small datasets rather than using if-condition. This reduces computation times tremendously!

			capture noisily intreg ln_wage ln_wage_cens $controls if marginal == 0 	// Tobit estimates with right-censored data (since ln_wage_cens has missings) (excluding marginal wages)
			
			capture noisily global sdi = e(sigma)									// save variance of residual from Tobit
			capture noisily predict xbn if e(sample), xb							// predict xb from observables
			
			capture noisily gen eta = (ln_limit_assess4 - xbn) / $sdi if e(sample)	// eta is a residual with standard normal distribution
			
			capture noisily gen ln_wage_tmp = ln_wage if cens == 0					// note: do not use if e(sample) here; otherwise uncensored wages of obs. with missing covariables get dropped
			capture noisily replace ln_wage_tmp = xbn + ///												// take xb from regression and add normally distributed random term
						$sdi * invnorm(normal(eta) + uniform()*(1-normal(eta))) ///		// with s.d. = sigma, that must be larger than social security contribution ceiling
						if e(sample) & cens == 1
			
			capture noisily replace ln_wage_imp = ln_wage_tmp
			
			capture noisily count if ln_wage_imp < ln_wage & e(sample)				// make sure the imputation produces no values below social security contribution ceiling
			capture noisily if r(N) > 0 disp in red r(N) " imputed wage(s) below censoring limit in year " `y' " and education group " `ed'  " and east = " `ew'
			
			capture noisily drop xbn eta ln_wage_tmp	// drop temporary variables
			
			local c=`c'+1
			capture noisily save ${data}\temp_`c'.dta, replace						// save data for year/education/east-combination temporarily on NAS
		}
	}
}

*Append individual parts
local files=`c'
use ${data}\temp_0.dta
forvalues c=1/`files' {
	capture noisily append using ${data}\temp_`c'.dta
	capture noisily erase ${data}\temp_`c'.dta
	}
erase ${data}\temp_0.dta


********************************************************************************
* Intermediate step: obtain LEAVE-ONE-OUT means of imputed wages 
* (for workers and plants; something like worker or plant fixed effects)
********************************************************************************

drop limit_assess_defl limit_assess4
drop cpi

* helper variable containing only ones
gen byte ones = 1

* worker mean-wages
bysort persnr (spell): egen obs = total(ones)			// number of observations per worker and quelle (source)
by persnr: egen ln_wage_sum = total(ln_wage_imp)			// sum wage over all employment episodes for each worker
gen ln_wage_mean_worker = (ln_wage_sum - ln_wage_imp)/(obs-1)	// subtract wage of respective episode (leave-one-out) and divide by number of other episodes
drop obs ln_wage_sum
compress

gen byte only_one_obs = missing(ln_wage_mean_worker)							// dummy if worker only observed once (and denominator above (obs-1) = 0)
sum ln_wage_imp
replace ln_wage_mean_worker = r(mean) if missing(ln_wage_mean_worker)	// set mean to sample mean if worker only observed once
*replace ln_wage_mean_worker = . if quelle != 1							// set non-BEH-spells to missing

* plant mean-wages
bysort jahr betnr (spell): egen obs = total(ones)				// number of worker episodes per plant and year
drop ones
by jahr betnr: egen ln_wage_sum = total(ln_wage_imp)			// sum wage over all employment episodes per plant and year
compress
gen ln_wage_mean_firm = (ln_wage_sum - ln_wage_imp)/(obs-1)		// subtract wage of respective worker (leave-one-out) and divide by number of other worker episodes
drop obs ln_wage_sum
compress

gen byte only_one_worker = missing(ln_wage_mean_firm)							// dummy if worker is firm's only employee
by jahr: egen ln_wage_sample = mean(ln_wage_imp)
replace ln_wage_mean_firm = ln_wage_sample if missing(ln_wage_mean_firm)	// set mean to sample mean if worker is firm's only employee
*replace ln_wage_mean_firm = . if quelle != 1								// set non-BEH-spells to missing

drop ln_wage_sample 
compress


	
********************************************************************************
* Step 2: extended imputation models including the leave-one-out means
********************************************************************************

* empty variable for (imputed) log-wages
gen ln_wage_imp2 = .

* create temporal copies of the dataset
tempfile	temp_beforeimp
save		"`temp_beforeimp'", replace

*Save observations that will be ignored in imputation 
keep if missing(east)
replace ln_wage_imp = ln_wage
save ${data}\temp_0.dta, replace   

* separately run Tobit-wage-regressions by all groups (including leave-one-out means)
local c=0							    	// Running variable that will index temporary datasets
forvalues y = $minY/$maxY {			// loop over years

	display "$S_DATE $S_TIME: Step 2, processing year `y' ..."

	forvalues ed = $minEduc/$maxEduc {		// loop over education groups
		forvalues ew = 0/1 {				// loop over East/West
		
			use "`temp_beforeimp'", clear
			keep if jahr == `y' & educ_tmp == `ed' & east == `ew'

			capture noisily intreg ln_wage ln_wage_cens ln_wage_mean_worker only_one_obs ln_wage_mean_firm only_one_worker $controls if marginal == 0
			
			capture noisily global sdi = e(sigma)									// save variance of residual from Tobit
			capture noisily predict xbn if e(sample), xb							// predict xb from observables
			
			capture noisily gen eta = (ln_limit_assess4 - xbn) / $sdi if e(sample)	// eta is a residual with s.d. = sigma
			
			capture noisily gen ln_wage_tmp = ln_wage if cens == 0					// note: do not use if e(sample) here; otherwise uncensored wages of obs. with missing covariables get dropped
			capture noisily replace ln_wage_tmp = xbn + ///  											// take xb from regression and add normally distributed random term
					 $sdi * invnorm(normal(eta) + uniform()*(1-normal(eta))) /// 		// with s.d. = sigma, that must be larger than social security contribution ceiling
					if e(sample) & cens==1
					
			capture noisily replace ln_wage_imp2 = ln_wage_tmp if jahr == `y' & educ_tmp == `ed' & east == `ew'
			
			capture noisily count if ln_wage_imp2 < ln_wage & e(sample)				// make sure imputation produces no values below social security contribution ceiling
			capture noisily if r(N) > 0 disp in red r(N) " imputed wage(s) below censoring limit in year " `y' " and education group " `ed'  " and east = " `ew'
			
			capture noisily drop xbn eta ln_wage_tmp		// drop temporal variables

			local c=`c'+1
			compress
			capture noisily save ${data}\temp_`c'.dta, replace						// save data for year/education/east-combination temporarily on NAS
		}
	}
}

*Append individual parts
local files=`c'
use ${data}\temp_0.dta
forvalues c=1/`files' {
	capture noisily append using ${data}\temp_`c'.dta
	capture noisily erase ${data}\temp_`c'.dta
	}
erase ${data}\temp_0.dta


********************************************************************************
* Imputed wages in levels
********************************************************************************

gen wage_imp_int = exp(ln_wage_imp)		// intermediate wages from step 1 (for comparison))
gen wage_imp = exp(ln_wage_imp2)		// final version of imputed daily wages
compress

label variable wage_imp_int "Intermediate version of imputed daily wage"
label variable wage_imp 	"Imputed daily wage, deflated (2015)"

********************************************************************************
* Minor adjustments:
* 	- Limit imputed wages at 10 * 99th percentile (in extremely rare cases imputed wages could by chance be inplausible high)
*	- Replace missing wages from the second stage by imputed wages from the first stage (in the very rare case that plant-means of wages are extremely low, invnorm(...) returns missing values due to a numeric overflow)
********************************************************************************

sum wage_imp, d
global maxWage = 10 * r(p99)
replace wage_imp_int = $maxWage if wage_imp_int > $maxWage & !missing(wage_imp_int)
replace wage_imp 	 = $maxWage if wage_imp > $maxWage & !missing(wage_imp)

replace wage_imp = wage_imp_int if missing(wage_imp)


********************************************************************************
* Overview/Comparison of wages and imputed wages
********************************************************************************

* summary statistics
if(${inspect}==1) sum wage*
if(${inspect}==1) sum wage* if cens == 1
if(${inspect}==1) sum wage* if jahr == $maxY
if(${inspect}==1) sum wage* if cens == 1 & jahr == $maxY

* correlations
if(${inspect}==1) corr wage*
if(${inspect}==1) corr wage* if cens == 1
if(${inspect}==1) corr wage* if cens == 0

********************************************************************************
* Clean up
********************************************************************************

keep wage_imp persnr spell jahr tentgelt ao_bula


********************************************************************************
* Save data
********************************************************************************

rename jahr year 
rename tentgelt tentgelt_orig 
rename wage_imp tentgelt
label var tentgelt "imputed daily wage"

keep persnr spell tentgelt tentgelt_orig ao_bula
compress

save "$temp\imp_wage`sample'.dta", replace
clear
}

use "$temp\imp_wage1.dta"
append using "$temp\imp_wage2.dta"
append using "$temp\imp_wage3.dta"
append using "$temp\imp_wage4.dta"
append using "$temp\imp_wage5.dta"
append using "$temp\imp_wage6.dta"
append using "$temp\imp_wage7.dta"
append using "$temp\imp_wage8.dta"

compress
save "$temp\imp_wage.dta", replace
clear

erase "$temp\imp_wage1.dta"
erase "$temp\imp_wage2.dta"
erase "$temp\imp_wage3.dta"
erase "$temp\imp_wage4.dta"
erase "$temp\imp_wage5.dta"
erase "$temp\imp_wage6.dta"
erase "$temp\imp_wage7.dta"
erase "$temp\imp_wage8.dta"
cap log close
